Let’s format the names
data_all <- data_all %>%
mutate(name = gsub('_flipped', '', name),
name = gsub('_rc', '', name))
data_all <- data_all %>%
mutate(category = case_when(grepl('unscrambled', .$name) ~ 'unscramble',
grepl('neg', .$name) ~ 'negative',
TRUE ~ 'scramble'))
table(data_all$category)
negative scramble unscramble
553 51702 1796
data <- filter(data_all, category != 'negative')
data <- data %>%
separate(name, into = c('tss_name', 'tss_position', 'strand_scramble_loc'), sep = ',',
convert = T, remove = F) %>%
separate(strand_scramble_loc, into = c('strand', 'scramble_loc'), sep = '_') %>%
mutate(scramble_loc = gsub('unscrambled', NA, scramble_loc),
scramble_loc = gsub('scrambled', '', scramble_loc)) %>%
separate(scramble_loc, into = c('scramble_start', 'scramble_end'),
sep = '-', convert = T)
Expected 3 pieces. Missing pieces filled with `NA` in 6 rows [12856, 12912, 14805, 38073, 42258, 46862].
example <- filter(data, tss_name == 'TSS_2716_regulondb')
ggplot(example, aes(scramble_start, RNA_exp_ave)) + geom_point()+
labs(x = 'scramble start position (length 10bp)', y = 'expression')
negatives <- filter(data_all, category == 'negative')
neg_median <- median(negatives$RNA_exp_ave)
threshold <- 2 * neg_median
example <- example %>%
mutate(active = ifelse(RNA_exp_ave >= threshold, 'active', 'inactive'))
ggplot(example, aes(scramble_start, RNA_exp_ave)) + geom_point(aes(color = active)) +
scale_color_manual(values = c('red', 'black')) +
labs(x = 'scramble start position (length 10bp)', y = 'expression', color = '')
ggplot(data, aes(scramble_start, RNA_exp_ave)) + geom_point() +
labs(x = 'scramble start position (length 10bp)', 'expression') +
scale_y_log10()
Let’s calculate the expression of each scrambled sequence relative to the unscrambled sequence
data <- data %>%
group_by(tss_name) %>%
mutate(unscrambled_exp = ifelse(any(category == 'unscramble'),
RNA_exp_ave[category == 'unscramble'],
NA),
relative_exp = RNA_exp_ave / unscrambled_exp)
data <- data %>%
mutate(active = ifelse(RNA_exp_ave >= threshold, 'active', 'inactive'))
data %>%
ggplot(aes(scramble_start, relative_exp)) + geom_point(aes(color = active)) +
scale_color_manual(values = c('red', 'black')) +
scale_y_log10() +
labs(x = 'scramble start', y= 'relative expression', color = '')
data %>%
filter(tss_name == 'TSS_2716_regulondb') %>%
ggplot(aes(scramble_start, relative_exp)) + geom_point(aes(color = active)) +
scale_color_manual(values = c('red', 'black')) +
scale_y_log10() + geom_hline(yintercept = 1, linetype = 'dashed') +
labs(x = 'scramble start', y= 'relative expression', color = '')
data %>%
mutate(active_relative = ifelse(relative_exp >= 1, 'active', 'inactive')) %>%
group_by(active_relative) %>%
tally()
[38;5;246m# A tibble: 3 x 2[39m
active_relative n
[3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<int>[39m[23m
[38;5;250m1[39m active [4m2[24m[4m0[24m304
[38;5;250m2[39m inactive [4m2[24m[4m7[24m277
[38;5;250m3[39m [31mNA[39m [4m5[24m917
Only 68% of the library was mapped, so it seems many sequences are missing their unscrambled counterpart. After the next mapping run, we should get most of the library mapped. It’s probably best to classify each sequence as inactive/active relative to their unscrambled sequence. Anything with less expression than unscrambled will be “inactive” and anything higher as “active”.
corr <- cor(data$RNA_exp_1, data$RNA_exp_2)
ggplot(data, aes(RNA_exp_1, RNA_exp_2)) + geom_point() +
scale_x_log10() + scale_y_log10() + annotation_logticks(sides = 'bl') +
labs(x = 'biological replicate 1', y = 'biological replicate 2') +
annotate('text', x = 10, y = 0.1, label = paste0('r = ', signif(corr, 3)), size = 6)
Which k-mers are enriched in the active vs. inactive sequences? If the scrambled sequence has increased expression relative to unscrambled, we assume it is disrupting a repressive element. If the scrambled sequence has decreased expression relative to unscrambled, we assume it is disrupting an activating element.
bases <- c('A', 'T', 'G', 'C')
possible_kmers <- gtools::permutations(n = length(bases),
v = bases,
r = k,
repeats.allowed = T)
possible_kmers <- apply(possible_kmers, 1, paste, collapse='')
total_kmers_increased <- (150 - k + 1) * length(increased_set)
total_kmers_decreased <- (150 - k + 1) * length(decreased_set)
kmer_fisher <- function(kmer, df1, df2, df1_total, df2_total) {
df1_count <- df1[kmer]
df2_count <- df2[kmer]
mat <-matrix(c(df1_count, df2_count,
df1_total - df1_count,
df2_total - df2_count), nrow = 2)
test <- fisher.test(mat)
return(test)
}
tests <- mapply(kmer_fisher,
kmer = possible_kmers,
MoreArgs = list(
df1 = increased_kmer_freq,
df2 = decreased_kmer_freq,
df1_total = total_kmers_increased,
df2_total = total_kmers_decreased))
tests <- as.data.frame(t(tests))
tests$kmer <- rownames(tests)
tests <- tests %>%
mutate(p.value = as.numeric(p.value),
estimate = as.numeric(estimate),
p.value.adjusted = p.adjust(tests$p.value, method = 'fdr'))
signif_kmers_fdr <- tests %>%
filter(p.value.adjusted <= 0.05) %>%
arrange(p.value.adjusted) %>%
select(kmer, p.value, p.value.adjusted, estimate)
signif_kmers_fdr <- signif_kmers_fdr %>%
mutate(kmer_rc =
as.character(Biostrings::reverseComplement(
DNAStringSet(signif_kmers_fdr$kmer)))) %>%
select(kmer, kmer_rc, p.value:estimate)
signif_kmers_fdr %>%
filter(estimate < 1) %>% nrow()
[1] 543
signif_kmers_fdr %>%
filter(estimate > 1) %>% nrow
[1] 491
Do these k-mer match any TF PWMs for E. coli?
# http://regulondb.ccg.unam.mx/menu/download/datasets/files/BindingSiteSet.txt
tf_sites <- read.table('../../ref/regulondb_tfbs.txt', comment.char = '#',
header = F, sep = '\t',
col.names = c('tf_id', 'tf_name', 'tfbs_id', 'tfbs_left',
'tfbs_right', 'strand', 'tf_gene_id', 'tx_unit',
'expression_effect', 'promoter_name',
'center_pos_relative_tss', 'tfbs_sequence',
'evidence', 'evidence_confidence'))
# extract TFBS from sequence, in upper case
# grab upper case part of site corresponding to binding site
extract_upper <- function(string, toString) {
replace_lower <- strsplit(string, "[[:lower:]]*")[[1]]
only_upper <- replace_lower[replace_lower != ""]
if(toString == T) {
return(paste(only_upper, collapse = ''))
}
else{
return(only_upper)
}
}
tf_sites$tfbs <- unlist(lapply(tf_sites$tfbs_sequence, extract_upper, toString = T))
signif_kmers_fdr <- signif_kmers_fdr %>%
group_by(kmer) %>%
mutate(tf_match_most_common = ifelse(any(grep(kmer, tf_sites$tfbs)),
names(which.max(table(tf_sites$tf_name[grep(kmer,
tf_sites$tfbs)]))),
NA),
num_tf_match_most_common = ifelse(any(grep(kmer, tf_sites$tfbs)),
max(table(tf_sites$tf_name[grep(kmer, tf_sites$tfbs)])),
NA)) %>%
ungroup()
signif_kmers_fdr <- signif_kmers_fdr %>%
mutate(kmer_type = ifelse(estimate > 1, 'enriched', 'depleted'))
tf_counts <- signif_kmers_fdr %>%
group_by(kmer_type, tf_match_most_common) %>%
tally() %>%
arrange(desc(n))
tf_counts %>%
filter(n >= 10, !is.na(tf_match_most_common)) %>%
ggplot(aes(reorder(tf_match_most_common, n), n)) +
geom_bar(aes(fill = kmer_type), stat = 'identity', position = 'dodge') +
scale_fill_manual(values = c('darkred', 'darkgreen')) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10)) +
labs(x = 'TF', y = 'number of matching k-mers', fill = '')
NA
Most common TF is CRP, cAMP-activated global transcriptional regulator, which makes sense. Directly regulates transcription of ~300 genes in about 200 operons, indirectly regulates expression of about half the genome.
Next most common is Fis, DNA-binding protein Fis. Activates ribosomal RNA transcription, as well as other genes. Plays direct role in upstream activation of rRNA promoters.
Lrp, leucine-responsive regulatory protein. Mediates global response to leucine.
NarL, nitrate/nitrite response regulator protein, activates the expression of the nitrate reductase (narGHJI) and formate dehydrogenase-N (fdnGHI) operons and represses the transcription of the fumarate reductase (frdABCD) operon in response to a nitrate/nitrite induction signal transmitted by either the NarX or NarQ proteins.
Instead of doing an exact match between the k-mer and the TFBS, let’s create an HMM for each TF and use that to score each k-mer.
create_phmm <- function(df, tf, n_iter = 10, verbose = F) {
seq_list <- strsplit(toupper(filter(df, tf_name == tf)$tfbs), split = '')
# for some reason, maybe the way the sequences are randomly aligned, the same
# exact command will fail multiple times and then suceed. So, just keep trying
# until it works
phmm <- NULL
attempt <- 0
while(is.null(phmm) && attempt < n_iter) {
attempt <- attempt + 1
try (
phmm <- derivePHMM(seq_list, maxsize = max(unlist(lapply(seq_list, length))), quiet = !verbose),
silent = T
)
}
if(attempt > 1) {
print(paste("Number of attempts for", tf, ":", attempt))
}
if(is.null(phmm)) {
return(NA)
}
else{
return(phmm)
}
}
tf_sites_with_sequence <- filter(tf_sites, tfbs != '')
tf_list <- as.list(unique(tf_sites_with_sequence$tf_name))
tf_phmms <- lapply(tf_list, create_phmm, df = tf_sites_with_sequence)
[1] "Number of attempts for AcrR : 10"
[1] "Number of attempts for CpxR : 10"
[1] "Number of attempts for Cra : 10"
[1] "Number of attempts for CytR : 10"
[1] "Number of attempts for Fis : 10"
[1] "Number of attempts for FlhDC : 5"
[1] "Number of attempts for FliZ : 10"
[1] "Number of attempts for Fur : 10"
[1] "Number of attempts for H-NS : 10"
[1] "Number of attempts for IHF : 10"
[1] "Number of attempts for IscR : 10"
[1] "Number of attempts for LexA : 10"
[1] "Number of attempts for Lrp : 10"
[1] "Number of attempts for NsrR : 10"
[1] "Number of attempts for PhoB : 10"
[1] "Number of attempts for PhoP : 4"
[1] "Number of attempts for YdeO : 10"
missing_phmms <- lapply(missing_tfs, create_phmm, df = tf_sites_with_sequence, n_iter = 20)
[1] "Number of attempts for AcrR : 20"
[1] "Number of attempts for CpxR : 20"
[1] "Number of attempts for Cra : 20"
[1] "Number of attempts for CytR : 8"
[1] "Number of attempts for Fis : 20"
[1] "Number of attempts for FliZ : 20"
[1] "Number of attempts for Fur : 20"
[1] "Number of attempts for H-NS : 20"
[1] "Number of attempts for IHF : 20"
[1] "Number of attempts for IscR : 20"
[1] "Number of attempts for LexA : 20"
[1] "Number of attempts for NsrR : 20"
[1] "Number of attempts for PhoB : 20"
[1] "Number of attempts for YdeO : 20"
Why is this happening? Let’s take Fis as an example.
fis_sites <- filter(tf_sites_with_sequence, tf_name == 'Fis')$tfbs
fis_seq_list <- strsplit(toupper(fis_sites), split = '')
derivePHMM(fis_seq_list)
Deriving profile HMM
Refining model
Iteration 1: alignment with 269 rows & 52 columns, PHMM with 20 modules
Error in tmp == hashis : non-conformable arrays
The initial alignment attempt has 52 columns, and maybe the multiple sequence alignment is just too unstable. Is there varying binding site lengths?
fis_site_lengths <- unlist(lapply(fis_seq_list, length))
table(fis_site_lengths)
fis_site_lengths
15 19 20
243 25 1
Let’s only keep sites that are the predominant 15bp and see if this helps.
fis_seq_list_trimmed <- fis_seq_list[fis_site_lengths == 15]
derivePHMM(fis_seq_list_trimmed)
Deriving profile HMM
Refining model
Iteration 1: alignment with 243 rows & 15 columns, PHMM with 15 modules
Sequential alignments were identical after 1 iterations
Done
Profile hidden Markov model (object class: 'PHMM')
with 15 internal modules emitting 4 unique residues
(A, C, G, T).
If you trim the binding sites to only include 15bp, it works perfectly fine.
Hmm at least for the TFs that we couldn’t generate PHMMs, they all have at least two different binding site lengths. How true is this for all TFs?
Let’s see if there’s a way to tweak the alignment parameters before we just start eliminating sites.
derivePHMM(fis_seq_list, progressive = T, seeds = which(fis_site_lengths == 15)[1:25], maxsize=15)
Calculating sequence weights using maximum entropy method
Progressively aligning sequences
Deriving profile HMM
Refining model
Iteration 1: alignment with 269 rows & 48 columns, PHMM with 15 modules
Error in tmp == hashis : non-conformable arrays
Actually, let’s just use the position specific scoring matrices from RegulonDBs, then convert to PWMs.
python parse_regulondb_pssm.py ../../ref/regulondb_tf_pssm.txt regulondb_tf_pssm_parsed.txt
pwm_list <- PFMtoPWM(pfm_list, ids = as.character(tf_names))
Error in PWMUnscaled(motifs[[i]], id = id[i], name = name[i], ...) :
unused argument (ids = as.character(tf_names))
This actually won’t work because PWMs require the k-mer is at least as long as the PWM. Since we’re using 6-kmers and most PWMs are longer, this is a problem. Back to using PHMMs!
Let’s look at what k-mers are associated with relative expression, using simple linear regression. We only care about the significant ones.
kmer_counts <- kcount(x = strsplit(data$variant, split = ''), k = 6, residues = 'DNA')
kmer_counts_df <- data.frame(kmer_counts)
kmer_counts_df$expression <- data$relative_exp
Let’s first try individual linear regression for each k-mer and see how many are significant.
individ_kmer_reg <- function(kmer, kmer_counts_df) {
counts <- select(kmer_counts_df, kmer, expression)
model <- lm(log(expression) ~ ., counts)
summary <- summary(model)
# if k-mer count is too low, lm will return NA for coefficient
if(nrow(summary$coefficients) == 2) {
# get coefficient
coeff <- summary$coefficients[2, 1]
# get p-value of coefficient
pval <- summary$coefficients[2, 4]
return(list(c(coeff, pval)))
}
else {
return(list(c(NA, NA)))
}
}
# all_kmers <- as.list(colnames(kmer_counts))
# kmer_regression <- lapply(all_kmers, individ_kmer_reg, kmer_counts_df = kmer_counts_df)
# kmer_regression <- data.frame(matrix(unlist(kmer_regression), nrow = length(kmer_regression), byrow=T))
# colnames(kmer_regression) <- c('coeff', 'pval')
# kmer_regression$kmer <- unlist(all_kmers)
# save(kmer_regression, file = 'kmer_regression.rda')
load('kmer_regression.rda')
ggplot(kmer_regression, aes(log(pval))) + geom_histogram() +
geom_vline(xintercept = log(0.05), col = 'red') +
labs(x = 'log10(p-value) of linear regression coefficient',
title = 'Individual linear regression between\n k-mer and expression')
signif_kmers_lm_fdr <- kmer_regression %>%
mutate(pval_fdr = p.adjust(pval, method = 'fdr')) %>%
filter(pval_fdr <= 0.05)
print(nrow(signif_kmers_lm_fdr))
[1] 1252
K-mers to the left are significant.
# model_all_kmers <- lm(log(expression) ~ . , kmer_counts_df)
# saveRDS(model_all_kmers, file = 'model_all_kmers.rds', compress = T)
model_all_kmers <- readRDS('model_all_kmers.rds')
summary(model_all_kmers)
Call:
lm(formula = log(expression) ~ ., data = kmer_counts_df)
Residuals:
Min 1Q Median 3Q Max
-6.0489 -0.3032 0.0494 0.3603 3.9687
Coefficients: (11 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.4967005 1.4755591 -3.047 0.002309 **
AAAAAA 0.0346628 0.0158756 2.183 0.029011 *
AAAAAC 0.1034533 0.1427408 0.725 0.468601
AAAAAG 0.1104437 0.1343160 0.822 0.410929
AAAAAT 0.0038834 0.1158833 0.034 0.973267
AAAACA -0.0678518 0.1641119 -0.413 0.679280
AAAACC 0.0233996 0.1967573 0.119 0.905334
AAAACG -0.1696291 0.2156340 -0.787 0.431489
AAAACT -0.4432919 0.1844048 -2.404 0.016225 *
AAAAGA -0.2859106 0.1557424 -1.836 0.066395 .
AAAAGC -0.3636012 0.1743362 -2.086 0.037018 *
AAAAGG -0.1724975 0.2075501 -0.831 0.405915
AAAAGT -0.6326854 0.1466007 -4.316 0.000015944582 ***
AAAATA -0.0244477 0.1383692 -0.177 0.859757
AAAATC 0.0240097 0.1684269 0.143 0.886644
AAAATG -0.1048290 0.1390651 -0.754 0.450966
AAAATT -0.0024788 0.1344534 -0.018 0.985291
AAACAA -0.3024876 0.1832903 -1.650 0.098885 .
AAACAC 0.3997055 0.1866562 2.141 0.032247 *
AAACAG -0.1141133 0.1738206 -0.657 0.511506
AAACAT -0.0277251 0.1830403 -0.151 0.879606
AAACCA -0.1495554 0.2185344 -0.684 0.493754
AAACCC -0.2620934 0.2795264 -0.938 0.348438
AAACCG -0.8421313 0.3181962 -2.647 0.008134 **
AAACCT 0.5405033 0.2049551 2.637 0.008363 **
AAACGA 0.2403261 0.2207904 1.088 0.276389
AAACGC 0.0315989 0.2617952 0.121 0.903928
AAACGG 0.0149145 0.2898193 0.051 0.958958
AAACGT 0.1701203 0.2278278 0.747 0.455245
AAACTA 0.0857504 0.2132647 0.402 0.687624
AAACTC 0.2097300 0.2341909 0.896 0.370497
AAACTG 0.1532242 0.2269878 0.675 0.499659
AAACTT 0.2540304 0.1918823 1.324 0.185548
AAAGAA 0.7307900 0.1842917 3.965 0.000073390968 ***
AAAGAC 0.2510245 0.2151438 1.167 0.243307
AAAGAG 0.4474791 0.2070121 2.162 0.030654 *
AAAGAT 0.0320280 0.1766654 0.181 0.856139
AAAGCA 0.1507473 0.1803235 0.836 0.403169
AAAGCC -0.0915416 0.2013796 -0.455 0.649419
AAAGCG 0.3935789 0.1900547 2.071 0.038377 *
AAAGCT 0.8056594 0.3443603 2.340 0.019310 *
AAAGGA -0.0088592 0.2163742 -0.041 0.967341
AAAGGC -0.0976985 0.2993955 -0.326 0.744184
AAAGGG 0.1872420 0.2587234 0.724 0.469245
AAAGGT 0.2279107 0.2753221 0.828 0.407790
AAAGTA -0.1064875 0.2681563 -0.397 0.691288
AAAGTC 0.0947602 0.3577904 0.265 0.791128
AAAGTG 0.3497714 0.2219549 1.576 0.115064
AAAGTT 0.5429140 0.2754658 1.971 0.048742 *
AAATAA -0.0495951 0.1488327 -0.333 0.738964
AAATAC -0.1149441 0.2338122 -0.492 0.622999
AAATAG 0.0004822 0.1718163 0.003 0.997761
AAATAT 0.0135287 0.1943632 0.070 0.944508
AAATCA 0.1565364 0.1860283 0.841 0.400092
AAATCC -0.0232291 0.2099415 -0.111 0.911898
AAATCG -0.3032418 0.2235140 -1.357 0.174883
AAATCT 0.1074538 0.2052164 0.524 0.600551
AAATGA -0.0542237 0.1601893 -0.338 0.734990
AAATGC 0.4268177 0.1488635 2.867 0.004144 **
AAATGG 0.0727386 0.1600142 0.455 0.649417
AAATGT 0.1869129 0.1683267 1.110 0.266825
AAATTA 0.1321316 0.1528389 0.865 0.387309
AAATTC 0.0865200 0.1935651 0.447 0.654891
AAATTG -0.1814945 0.1865643 -0.973 0.330645
AAATTT 0.0223342 0.1457094 0.153 0.878179
AACAAA 0.1920313 0.2017633 0.952 0.341221
AACAAC -0.1135769 0.2760572 -0.411 0.680763
AACAAG 0.4150624 0.2554878 1.625 0.104258
AACAAT 0.3141905 0.1915233 1.640 0.100912
AACACA -0.3005835 0.2437656 -1.233 0.217551
AACACC -0.2624175 0.2468887 -1.063 0.287834
AACACG -1.2119097 0.5506119 -2.201 0.027740 *
AACACT -0.4387450 0.2218416 -1.978 0.047964 *
AACAGA -0.2770701 0.2239493 -1.237 0.216019
AACAGC -0.3241661 0.1973328 -1.643 0.100444
AACAGG -0.3793827 0.1984238 -1.912 0.055885 .
AACAGT -0.4021401 0.2596114 -1.549 0.121387
AACATA -0.1738631 0.2073920 -0.838 0.401850
AACATC 0.0811992 0.1814264 0.448 0.654473
AACATG 0.3066691 0.2158485 1.421 0.155393
AACATT 0.2244095 0.1818444 1.234 0.217182
AACCAA 0.0170356 0.4520103 0.038 0.969936
AACCAC -0.2502666 0.2268100 -1.103 0.269851
AACCAG -0.0817182 0.3770534 -0.217 0.828421
AACCAT -0.1657344 0.2944852 -0.563 0.573578
AACCCA -0.0608393 0.2911965 -0.209 0.834505
AACCCC -0.1184167 0.3260446 -0.363 0.716463
AACCCG 0.9098805 0.4111008 2.213 0.026884 *
AACCCT 0.6487052 0.7643426 0.849 0.396047
AACCGA 0.8936099 0.4042521 2.211 0.027074 *
AACCGC 1.3117655 0.4219952 3.108 0.001882 **
AACCGG 1.1413546 0.7986643 1.429 0.152989
AACCGT 0.4835298 0.3722034 1.299 0.193916
AACCTA -0.6859491 0.5353116 -1.281 0.200060
AACCTC 0.2110583 0.7462665 0.283 0.777317
AACCTG -0.7078331 0.2392125 -2.959 0.003088 **
AACCTT 0.0535798 0.7348884 0.073 0.941879
AACGAA -0.4525533 0.1979641 -2.286 0.022257 *
AACGAC -0.3594477 0.7585636 -0.474 0.635607
AACGAG -0.2801743 0.2494505 -1.123 0.261373
AACGAT -0.1215099 0.1855301 -0.655 0.512514
AACGCA -0.2286689 0.3115990 -0.734 0.463040
AACGCC -0.0291978 0.2992390 -0.098 0.922271
AACGCG 0.1849535 0.2290527 0.807 0.419399
AACGCT -0.0445065 0.2477571 -0.180 0.857438
AACGGA 0.2861158 0.3298880 0.867 0.385776
AACGGC 0.0870005 0.5747537 0.151 0.879684
AACGGG -0.7360682 0.3422389 -2.151 0.031502 *
AACGGT -0.3458252 0.3041472 -1.137 0.255531
AACGTA -0.4965596 0.2592710 -1.915 0.055472 .
AACGTC 0.0341153 0.3948919 0.086 0.931156
AACGTG -0.2110725 0.4500381 -0.469 0.639065
AACGTT -0.2711435 0.1964192 -1.380 0.167461
AACTAA 0.2112061 0.2391058 0.883 0.377070
AACTAC 1.0088608 0.3150062 3.203 0.001363 **
AACTAG 1.3734763 0.5574378 2.464 0.013747 *
AACTAT 0.5921140 0.2111937 2.804 0.005055 **
AACTCA 0.0301969 0.2751058 0.110 0.912596
AACTCC -0.4002988 0.3188215 -1.256 0.209283
AACTCG 0.1566576 0.2963248 0.529 0.597038
AACTCT 1.0830455 0.2770798 3.909 0.000092901833 ***
AACTGA 0.0948675 0.2543229 0.373 0.709135
AACTGC 0.2690480 0.2341675 1.149 0.250581
AACTGG 0.8542891 0.3490263 2.448 0.014384 *
AACTGT 0.2101331 0.2523040 0.833 0.404930
AACTTA 0.4309970 0.1933037 2.230 0.025777 *
AACTTC -0.1539476 0.2289870 -0.672 0.501397
AACTTG 0.0800730 0.5250208 0.153 0.878782
AACTTT 0.7029736 0.1810964 3.882 0.000104 ***
AAGAAA -0.6761689 0.1779309 -3.800 0.000145 ***
AAGAAC -0.7236264 0.2045952 -3.537 0.000405 ***
AAGAAG -0.3197459 0.2001810 -1.597 0.110210
AAGAAT -0.6813880 0.2066171 -3.298 0.000975 ***
AAGACA -0.6981306 0.5471585 -1.276 0.201990
AAGACC -1.4004383 0.3852624 -3.635 0.000278 ***
AAGACG -0.1895385 0.2289114 -0.828 0.407675
AAGACT 0.1414009 0.2519421 0.561 0.574634
AAGAGA -0.2321768 0.2599642 -0.893 0.371803
AAGAGC -0.2214761 0.2384291 -0.929 0.352948
AAGAGG -0.1581750 0.2504908 -0.631 0.527743
AAGAGT -0.3384405 0.2222687 -1.523 0.127850
AAGATA -0.0636478 0.1982384 -0.321 0.748161
AAGATC 0.4533621 0.2086107 2.173 0.029767 *
AAGATG 0.1878509 0.2236342 0.840 0.400918
AAGATT 0.3012269 0.1960917 1.536 0.124508
AAGCAA 0.0939013 0.1797835 0.522 0.601463
AAGCAC 0.0537352 0.1985355 0.271 0.786655
AAGCAG 0.4218254 0.2302250 1.832 0.066924 .
AAGCAT 0.3714247 0.2229722 1.666 0.095763 .
AAGCCA 0.4864016 0.1860518 2.614 0.008943 **
AAGCCC 0.3843087 0.2722818 1.411 0.158123
AAGCCG 0.2187312 0.2341019 0.934 0.350133
AAGCCT 0.0934850 0.2449765 0.382 0.702754
AAGCGA -0.3886433 0.1760524 -2.208 0.027281 *
AAGCGC -0.2316992 0.2269776 -1.021 0.307354
AAGCGG -0.0534090 0.2455940 -0.217 0.827844
AAGCGT -0.0622070 0.2285067 -0.272 0.785444
AAGCTA -0.5494474 0.3819475 -1.439 0.150288
AAGCTC -1.4675030 0.3774226 -3.888 0.000101 ***
AAGCTG -0.1064719 0.6111731 -0.174 0.861702
AAGCTT -0.7959320 0.3464732 -2.297 0.021610 *
AAGGAA 0.0999732 0.2048616 0.488 0.625550
AAGGAC -1.2837388 0.7501535 -1.711 0.087033 .
AAGGAG 0.2429174 0.1880737 1.292 0.196500
AAGGAT -0.0331754 0.2234415 -0.148 0.881969
AAGGCA -0.0621059 0.2837675 -0.219 0.826759
AAGGCC 0.6177923 0.3385671 1.825 0.068049 .
AAGGCG 0.1623371 0.2935539 0.553 0.580262
AAGGCT 0.3844371 0.3117596 1.233 0.217538
AAGGGA 0.0805338 0.2771780 0.291 0.771398
AAGGGC -0.1490703 0.3440801 -0.433 0.664840
AAGGGG 0.0181695 0.2649113 0.069 0.945319
AAGGGT -0.4096410 0.2311405 -1.772 0.076358 .
AAGGTA -0.1189957 0.2844228 -0.418 0.675674
AAGGTC 0.0171191 0.3003201 0.057 0.954543
AAGGTG -0.4434591 0.3331896 -1.331 0.183212
AAGGTT -0.3034563 0.2430401 -1.249 0.211823
AAGTAA 1.0599105 0.2877806 3.683 0.000231 ***
AAGTAC 0.0846495 0.3702650 0.229 0.819166
AAGTAG 0.7045046 0.3481783 2.023 0.043038 *
AAGTAT 0.6707056 0.2926125 2.292 0.021903 *
AAGTCA 0.9830651 0.3933616 2.499 0.012453 *
AAGTCC 0.3392590 0.3836665 0.884 0.376563
AAGTCG 0.5987267 0.4343519 1.378 0.168075
AAGTCT 0.2961262 0.4193031 0.706 0.480046
AAGTGA -0.5937189 0.2590594 -2.292 0.021920 *
AAGTGC 0.4859465 0.2969536 1.636 0.101755
AAGTGG 0.4163521 0.2674746 1.557 0.119572
AAGTGT 0.1751103 0.2808230 0.624 0.532919
AAGTTA -0.0470287 0.2929976 -0.161 0.872481
AAGTTC -0.0640105 0.2868914 -0.223 0.823445
AAGTTG 0.3392449 0.7703622 0.440 0.659671
AAGTTT -0.1734642 0.2824102 -0.614 0.539068
AATAAA 0.3020764 0.1318247 2.292 0.021939 *
AATAAC 0.1044976 0.1358761 0.769 0.441859
AATAAG -0.0286992 0.1522021 -0.189 0.850439
AATAAT 0.2697484 0.1381288 1.953 0.050841 .
AATACA 0.3997844 0.2482082 1.611 0.107256
AATACC 0.6503354 0.3794411 1.714 0.086549 .
AATACG 0.1875402 0.2609151 0.719 0.472281
[ reached getOption("max.print") -- omitted 3897 rows ]
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.696 on 43495 degrees of freedom
(5917 observations deleted due to missingness)
Multiple R-squared: 0.267, Adjusted R-squared: 0.1982
F-statistic: 3.879 on 4085 and 43495 DF, p-value: < 0.00000000000000022
Let’s see how many significant k-mers from individual linear regression compared to those from Fisher test enrichments.
coeff <- summary(model_all_kmers)$coefficients
coeff <- coeff[-1,]
signif_kmers_multiple_lm <- unlist(dimnames(coeff)[1][coeff[,4] <= 0.05])
# how many significant in both?
signif_kmers_overlap <- signif_kmers_fdr %>%
left_join(data.frame(kmer = signif_kmers_multiple_lm, lm_coeff = coeff[,1], lm_pval = coeff[,4]),
by = 'kmer') %>%
filter(!is.na(lm_coeff))
print(nrow(signif_kmers_overlap) / nrow(signif_kmers_fdr))
[1] 0.9961315
Good, most of them. How do the direction of their effects compare?
Hmm they don’t always match up, I’ll go with the linear regression more than the Fisher because it doesn’t have any classification into “active” or “inactive” and is just based on the relative expression value.
Let’s take a quick look at the coefficients.
What are these k-mers? Do they hit TFs?
signif_kmers_lm_fdr <- signif_kmers_lm_fdr %>%
group_by(kmer) %>%
mutate(tf_match_most_common = ifelse(any(grep(kmer, tf_sites$tfbs_sequence)),
names(which.max(
table(
tf_sites$tf_name[
grep(kmer, tf_sites$tfbs_sequence)])
)),
NA)
)
What are the range of coefficients for each TF?
Negative coefficients implies k-mer count is inversely correlated with expression. Positive coefficeints implies k-mer count is positively correlated with expression. For example,
Let’s run multiple regression again but with only the significant k-mers.
kmer_counts_signif <- select_(kmer_counts_df, .dots = signif_kmers_lm_fdr$kmer)
kmer_counts_signif$expression <- kmer_counts_df$expression
model_signif_kmers <- lm(log(expression) ~ . , kmer_counts_signif)
summary(model_signif_kmers)
Call:
lm(formula = log(expression) ~ ., data = kmer_counts_signif)
Residuals:
Min 1Q Median 3Q Max
-6.3819 -0.2844 0.0768 0.3766 4.3248
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.30890352 0.03801217 -8.126 0.000000000000000453 ***
AAAAAA 0.00434197 0.00822279 0.528 0.597473
AAAAAC -0.00648140 0.01713670 -0.378 0.705271
AAAAAG -0.01823988 0.01685400 -1.082 0.279157
AAAAAT 0.00055872 0.01387324 0.040 0.967876
AAAACG -0.05840539 0.02459502 -2.375 0.017568 *
AAAAGC 0.02058264 0.01974592 1.042 0.297244
AAAAGG 0.04480753 0.02013851 2.225 0.026088 *
AAAAGT -0.02483951 0.01801910 -1.379 0.168053
AAAATT -0.04825498 0.01403867 -3.437 0.000588 ***
AAACAG -0.04943112 0.01927990 -2.564 0.010354 *
AAACCA -0.01669646 0.01992655 -0.838 0.402091
AAACCT 0.03456437 0.02629653 1.314 0.188715
AAACGA 0.04281810 0.03757260 1.140 0.254455
AAACGC 0.06223080 0.02786662 2.233 0.025543 *
AAACGT -0.00122020 0.02976225 -0.041 0.967298
AAACTG -0.02401282 0.02372869 -1.012 0.311556
AAAGCA 0.01479123 0.02209281 0.670 0.503177
AAAGCG -0.00820831 0.02246070 -0.365 0.714775
AAAGGA 0.02891345 0.02280243 1.268 0.204805
AAAGGC -0.03630259 0.02463383 -1.474 0.140572
AAAGTA -0.00962764 0.02000412 -0.481 0.630318
AAATAT 0.02504872 0.01654050 1.514 0.129935
AAATCC 0.00513753 0.02003686 0.256 0.797640
AAATGC -0.08065266 0.01724900 -4.676 0.000002936527468966 ***
AAATTT -0.00843376 0.01589396 -0.531 0.595680
AACAAA 0.02406502 0.02229136 1.080 0.280340
AACAAC -0.01794779 0.02433063 -0.738 0.460723
AACAAT 0.04620472 0.02473605 1.868 0.061781 .
AACACC 0.04517806 0.02700382 1.673 0.094329 .
AACACG -0.03392832 0.03825992 -0.887 0.375199
AACATA 0.03411380 0.02079680 1.640 0.100942
AACATC 0.02521342 0.02015173 1.251 0.210875
AACATG 0.02686663 0.02213418 1.214 0.224828
AACATT 0.01117927 0.01802217 0.620 0.535059
AACCAA -0.02129134 0.02794335 -0.762 0.446096
AACCGA -0.12277487 0.03535238 -3.473 0.000515 ***
AACCTA -0.00192959 0.03293139 -0.059 0.953276
AACCTC 0.09657681 0.03573622 2.702 0.006885 **
AACCTT 0.01829238 0.02365405 0.773 0.439331
AACGAA -0.01133738 0.03810072 -0.298 0.766038
AACGAC -0.01758595 0.04583774 -0.384 0.701235
AACGAT -0.03884870 0.03905207 -0.995 0.319842
AACGCG -0.00121389 0.02777880 -0.044 0.965145
AACGTG -0.09291112 0.02761440 -3.365 0.000767 ***
AACGTT 0.04041376 0.02584499 1.564 0.117895
AACTCA -0.05497432 0.02591892 -2.121 0.033926 *
AACTCG -0.05188727 0.03840538 -1.351 0.176689
AACTCT 0.07873203 0.02880000 2.734 0.006264 **
AACTGA -0.00788876 0.02682936 -0.294 0.768733
AACTGC 0.02298311 0.02925435 0.786 0.432088
AACTTG -0.03681373 0.02841400 -1.296 0.195113
AAGAAC -0.04839534 0.02945740 -1.643 0.100412
AAGAAG -0.04908516 0.02391094 -2.053 0.040094 *
AAGACA 0.04160404 0.02914681 1.427 0.153472
AAGAGC 0.05419296 0.02892047 1.874 0.060956 .
AAGATA -0.06345340 0.01995376 -3.180 0.001474 **
AAGATC 0.01243106 0.02883235 0.431 0.666361
AAGCAA -0.00503979 0.02096836 -0.240 0.810058
AAGCAC 0.00830795 0.02896168 0.287 0.774221
AAGCGG -0.03243257 0.02771368 -1.170 0.241897
AAGCTA -0.05348907 0.02734676 -1.956 0.050476 .
AAGGCC 0.03072505 0.03871857 0.794 0.427463
AAGGCT 0.02572041 0.03407941 0.755 0.450421
AAGGGC 0.01050877 0.03134959 0.335 0.737466
AAGGGG 0.01863515 0.03114720 0.598 0.549647
AAGGTT -0.00865767 0.02370239 -0.365 0.714915
AAGTTT -0.00822470 0.02060736 -0.399 0.689811
AATACA 0.06263944 0.02260041 2.772 0.005580 **
AATAGA 0.00613824 0.02350861 0.261 0.794012
AATAGT -0.00269683 0.02427407 -0.111 0.911538
AATATA 0.01170722 0.02063295 0.567 0.570442
AATATC -0.04642274 0.01942804 -2.389 0.016877 *
AATATG -0.00405110 0.02141826 -0.189 0.849982
AATCAC -0.05200319 0.02247747 -2.314 0.020696 *
AATCCT -0.01738987 0.02406288 -0.723 0.469877
AATCTA -0.01112128 0.02896954 -0.384 0.701058
AATGCG 0.01753381 0.02059933 0.851 0.394672
AATGGC -0.00855939 0.02099067 -0.408 0.683444
AATGGT 0.00444812 0.02504891 0.178 0.859056
AATGTA -0.01322579 0.02108719 -0.627 0.530534
AATGTG 0.03257457 0.01974388 1.650 0.098979 .
AATTAT 0.05760757 0.01948958 2.956 0.003120 **
AATTCC 0.00199822 0.02249287 0.089 0.929211
AATTTA -0.02549466 0.01509163 -1.689 0.091164 .
AATTTC -0.03394645 0.01915121 -1.773 0.076310 .
AATTTG 0.05245184 0.02080484 2.521 0.011701 *
AATTTT -0.01220794 0.01465427 -0.833 0.404813
ACAAAA -0.00481080 0.02099861 -0.229 0.818792
ACAAAT -0.03089400 0.02214993 -1.395 0.163093
ACAACG -0.05579025 0.02709751 -2.059 0.039512 *
ACAACT -0.01977051 0.03101628 -0.637 0.523852
ACAATA -0.04237639 0.02549661 -1.662 0.096511 .
ACAATC -0.05506017 0.02810526 -1.959 0.050111 .
ACAATT -0.00763583 0.02522743 -0.303 0.762135
ACACAT -0.05413574 0.02606310 -2.077 0.037798 *
ACACCG 0.02993431 0.03245021 0.922 0.356289
ACACGA -0.00061195 0.04356299 -0.014 0.988792
ACACGC 0.07264561 0.04209101 1.726 0.084369 .
ACACGG 0.02781476 0.04475326 0.622 0.534265
ACAGAC -0.06537753 0.03112026 -2.101 0.035664 *
ACAGCA 0.02093989 0.03403850 0.615 0.538437
ACAGCT 0.07806835 0.03229509 2.417 0.015638 *
ACATAA -0.02485295 0.02146362 -1.158 0.246907
ACATAC -0.10612408 0.02851754 -3.721 0.000198 ***
ACATAG 0.07248259 0.03297854 2.198 0.027963 *
ACATCT 0.06278010 0.02292731 2.738 0.006180 **
ACATGG -0.04481365 0.03063662 -1.463 0.143543
ACATTG 0.07618923 0.02387658 3.191 0.001419 **
ACCAAA -0.05406428 0.02520540 -2.145 0.031962 *
ACCAAG -0.07523934 0.04202514 -1.790 0.073406 .
ACCACG -0.08155554 0.03775269 -2.160 0.030758 *
ACCATA -0.01283445 0.02772975 -0.463 0.643481
ACCATC 0.03536823 0.02802673 1.262 0.206974
ACCCCA -0.07591990 0.03328358 -2.281 0.022553 *
ACCCCG -0.01006831 0.03944345 -0.255 0.798524
ACCGAA 0.09070944 0.03549101 2.556 0.010596 *
ACCGAC 0.05720835 0.03422823 1.671 0.094654 .
ACCGGC -0.02336629 0.03874509 -0.603 0.546460
ACCGGT -0.05577167 0.04270943 -1.306 0.191614
ACCGTA -0.08198917 0.03326271 -2.465 0.013709 *
ACCTAC 0.04950669 0.04404683 1.124 0.261038
ACCTCT -0.10620800 0.03338804 -3.181 0.001469 **
ACCTGT 0.01400613 0.02526661 0.554 0.579353
ACCTTC 0.01274901 0.03067254 0.416 0.677669
ACGAAC -0.06744141 0.03651479 -1.847 0.064759 .
ACGAAG 0.03062168 0.03409718 0.898 0.369153
ACGAAT 0.00740583 0.02746450 0.270 0.787430
ACGACA 0.01031014 0.04044516 0.255 0.798789
ACGACG -0.10603092 0.04033904 -2.628 0.008579 **
ACGATC 0.08554000 0.03567667 2.398 0.016505 *
ACGATG 0.03137694 0.03339516 0.940 0.347445
ACGATT -0.01635874 0.02763742 -0.592 0.553917
ACGCCA -0.04963752 0.02481847 -2.000 0.045504 *
ACGCTA -0.08941312 0.03251800 -2.750 0.005968 **
ACGCTG -0.05568801 0.02747102 -2.027 0.042652 *
ACGCTT 0.03900611 0.02705346 1.442 0.149361
ACGGAC -0.06699623 0.04227978 -1.585 0.113066
ACGGAT 0.01423083 0.03288796 0.433 0.665230
ACGGCT 0.04952514 0.03102412 1.596 0.110419
ACGGGG -0.04838074 0.03819598 -1.267 0.205289
ACGTCA 0.05075582 0.02729165 1.860 0.062926 .
ACGTCG 0.06486230 0.03972145 1.633 0.102491
ACGTTA -0.08210589 0.02602692 -3.155 0.001608 **
ACGTTC 0.04557785 0.03730121 1.222 0.221757
ACGTTT -0.04551824 0.02416693 -1.883 0.059640 .
ACTAAG 0.08649737 0.03817665 2.266 0.023473 *
ACTACA -0.06225711 0.03095016 -2.012 0.044276 *
ACTACC 0.05860704 0.03598821 1.629 0.103424
ACTATC 0.01330288 0.03581568 0.371 0.710322
ACTCGA -0.00908249 0.05129361 -0.177 0.859455
ACTCTG -0.00009765 0.03276199 -0.003 0.997622
ACTGAG 0.13395064 0.05186372 2.583 0.009805 **
ACTGCG -0.08460424 0.03165682 -2.673 0.007530 **
ACTGCT -0.07188756 0.03205382 -2.243 0.024920 *
ACTGGA -0.03236605 0.02565908 -1.261 0.207176
ACTTAT -0.00716207 0.02011854 -0.356 0.721847
ACTTCG 0.04137434 0.03069999 1.348 0.177762
ACTTTA 0.02251267 0.01777773 1.266 0.205397
ACTTTG -0.06499289 0.02253628 -2.884 0.003929 **
AGAAAA -0.01519460 0.01696324 -0.896 0.370398
AGAACA -0.03934841 0.02752780 -1.429 0.152894
AGAATA -0.04364856 0.02021496 -2.159 0.030838 *
AGAATC 0.02574459 0.02542278 1.013 0.311229
AGACAA -0.05286864 0.02969818 -1.780 0.075050 .
AGACAG -0.03755090 0.03275858 -1.146 0.251680
AGACGA -0.08303362 0.03298838 -2.517 0.011837 *
AGACTT 0.02300946 0.02586056 0.890 0.373604
AGAGTG 0.06188758 0.02508691 2.467 0.013631 *
AGATAC 0.00360332 0.03449160 0.104 0.916797
AGATCG 0.08286773 0.03057748 2.710 0.006729 **
AGATGA -0.10786633 0.08080641 -1.335 0.181924
AGATTA 0.00701791 0.02066421 0.340 0.734147
AGATTG 0.04448671 0.02138419 2.080 0.037498 *
AGCAAG -0.05697774 0.02637486 -2.160 0.030754 *
AGCACA -0.05033715 0.03209398 -1.568 0.116788
AGCACT -0.03847891 0.03254978 -1.182 0.237150
AGCAGA -0.04329866 0.02820399 -1.535 0.124742
AGCAGG 0.06610018 0.02693366 2.454 0.014124 *
AGCAGT -0.07334428 0.02886027 -2.541 0.011045 *
AGCCAT -0.06449470 0.02498634 -2.581 0.009849 **
AGCCGG -0.04520250 0.03592772 -1.258 0.208344
AGCGAA -0.03712854 0.02393078 -1.551 0.120789
AGCGAG -0.09351445 0.03413552 -2.740 0.006156 **
AGCGGC -0.04049497 0.03447445 -1.175 0.240146
AGCGTT -0.02598932 0.02718647 -0.956 0.339095
AGCTGT 0.08566730 0.03115481 2.750 0.005967 **
AGCTTG 0.00520686 0.03463014 0.150 0.880484
AGGAAA -0.00869007 0.02139139 -0.406 0.684567
AGGACC -0.01243861 0.04955451 -0.251 0.801809
AGGAGC 0.03082679 0.04143346 0.744 0.456876
AGGAGG -0.03922422 0.04447811 -0.882 0.377848
AGGAGT 0.00207631 0.03011099 0.069 0.945026
AGGATG 0.08959672 0.02969274 3.017 0.002550 **
AGGATT 0.02931155 0.02612289 1.122 0.261841
AGGCAA 0.00842560 0.02732757 0.308 0.757841
AGGCAG 0.05865355 0.03266642 1.796 0.072576 .
AGGCCA -0.03264948 0.03914308 -0.834 0.404226
AGGCCG -0.10639063 0.03599921 -2.955 0.003125 **
AGGCCT -0.07423212 0.04013318 -1.850 0.064371 .
[ reached getOption("max.print") -- omitted 1053 rows ]
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7249 on 46328 degrees of freedom
(5917 observations deleted due to missingness)
Multiple R-squared: 0.1529, Adjusted R-squared: 0.13
F-statistic: 6.679 on 1252 and 46328 DF, p-value: < 0.00000000000000022
Hmm, the model does worse, which isn’t too surprising since we remove a lot of variables.
signif_kmers_lm_fdr %>%
group_by(tf_match_most_common) %>%
summarise(n = n()) %>%
arrange(desc(n))
Let’s take k-mers that match to top 5 TFs and see what this small model looks like.
top_tfs <- signif_kmers_lm_fdr %>%
group_by(tf_match_most_common) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
top_n(5) %>% .$tf_match_most_common
Selecting by n
kmer_tf_subset <- select_(kmer_counts_df, .dots =
filter(signif_kmers_lm_fdr, tf_match_most_common %in% top_tfs) %>% .$kmer)
kmer_tf_subset$expression <- kmer_counts_df$expression
model_tf_subset <- lm(expression ~ . , kmer_tf_subset)
summary(model_tf_subset)
Call:
lm(formula = expression ~ ., data = kmer_tf_subset)
Residuals:
Min 1Q Median 3Q Max
-2.038 -0.451 -0.144 0.155 56.725
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.9749748 0.0401750 24.268 < 0.0000000000000002 ***
AAAAAA -0.0061672 0.0143182 -0.431 0.666673
AAAAAC -0.0036803 0.0283064 -0.130 0.896553
AAAAAG -0.0301875 0.0268088 -1.126 0.260159
AAAAAT 0.0070495 0.0227474 0.310 0.756637
AAAAGT -0.0392561 0.0305887 -1.283 0.199375
AAACAG -0.0271519 0.0320349 -0.848 0.396679
AAACGT -0.0683979 0.0380641 -1.797 0.072356 .
AAACTG -0.0590590 0.0358051 -1.649 0.099061 .
AAAGCG -0.0258121 0.0329163 -0.784 0.432943
AAAGGA 0.0316571 0.0377590 0.838 0.401811
AAAGTA 0.0042980 0.0339669 0.127 0.899309
AAATAT 0.0324755 0.0245361 1.324 0.185650
AAATCC -0.0479755 0.0338925 -1.416 0.156923
AAATGC -0.0203362 0.0284134 -0.716 0.474164
AAATTT 0.0568125 0.0239034 2.377 0.017470 *
AACAAA -0.0347280 0.0325130 -1.068 0.285469
AACACC 0.0329913 0.0463548 0.712 0.476645
AACACG -0.0882816 0.0552528 -1.598 0.110099
AACATC 0.0805475 0.0337698 2.385 0.017074 *
AACATG -0.0014276 0.0375766 -0.038 0.969694
AACATT -0.0065428 0.0301882 -0.217 0.828416
AACCAA -0.0369513 0.0434694 -0.850 0.395300
AACCTC 0.1151940 0.0590894 1.949 0.051243 .
AACGAA -0.0177052 0.0412088 -0.430 0.667455
AACGTG -0.0487307 0.0462709 -1.053 0.292272
AACTCA -0.0326836 0.0434917 -0.751 0.452362
AACTGA -0.0369097 0.0437235 -0.844 0.398583
AAGATC 0.0140474 0.0490131 0.287 0.774416
AAGGCC -0.0766659 0.0575683 -1.332 0.182953
AATAGA -0.0128498 0.0391495 -0.328 0.742743
AATATG 0.0060841 0.0337136 0.180 0.856790
AATCAC 0.0232921 0.0377294 0.617 0.537010
AATCCT 0.0834822 0.0398608 2.094 0.036235 *
AATGTG 0.0587228 0.0327984 1.790 0.073394 .
AATTTC -0.0160975 0.0284415 -0.566 0.571405
AATTTG 0.0044112 0.0330391 0.134 0.893787
AATTTT 0.0124287 0.0219368 0.567 0.571011
ACAAAA -0.0266217 0.0351010 -0.758 0.448198
ACAAAT -0.0175117 0.0368641 -0.475 0.634765
ACAACT -0.0393199 0.0433222 -0.908 0.364087
ACAATA -0.0060215 0.0305708 -0.197 0.843853
ACAATT 0.0206947 0.0301875 0.686 0.493007
ACACAT -0.0615743 0.0455523 -1.352 0.176468
ACACCG 0.1413947 0.0494335 2.860 0.004234 **
ACACGC 0.0091023 0.0669598 0.136 0.891872
ACAGAC -0.0762503 0.0489867 -1.557 0.119584
ACAGCA -0.0630842 0.0403412 -1.564 0.117879
ACAGCT 0.1779348 0.0564735 3.151 0.001629 **
ACATAA -0.0615471 0.0311053 -1.979 0.047859 *
ACATAG 0.0745688 0.0546597 1.364 0.172499
ACATCT 0.1008930 0.0383880 2.628 0.008586 **
ACATGG -0.0726741 0.0508539 -1.429 0.152989
ACATTG 0.0781677 0.0388967 2.010 0.044477 *
ACCAAA -0.0290820 0.0404260 -0.719 0.471904
ACCAAG -0.0094985 0.0709589 -0.134 0.893514
ACCATA 0.0529996 0.0416210 1.273 0.202888
ACCATC -0.0117153 0.0451365 -0.260 0.795209
ACCCCA 0.0739384 0.0519730 1.423 0.154850
ACCCCG -0.0277527 0.0626101 -0.443 0.657578
ACCGAA -0.0525009 0.0593721 -0.884 0.376555
ACCGGC -0.0773834 0.0606885 -1.275 0.202283
ACCGTA -0.1622923 0.0563186 -2.882 0.003957 **
ACCTCT -0.0763415 0.0564817 -1.352 0.176505
ACGAAT 0.0380689 0.0437921 0.869 0.384683
ACGACA 0.0579181 0.0647266 0.895 0.370893
ACGACG -0.0220005 0.0669070 -0.329 0.742292
ACGATC -0.0008746 0.0569558 -0.015 0.987748
ACGCTA -0.0262610 0.0485016 -0.541 0.588203
ACGCTG 0.0496074 0.0445837 1.113 0.265852
ACGCTT 0.0718369 0.0441195 1.628 0.103482
ACGGAC -0.0360435 0.0707392 -0.510 0.610385
ACGGAT -0.0158453 0.0519194 -0.305 0.760223
ACGGGG -0.0223775 0.0632190 -0.354 0.723364
ACGTCA 0.0108328 0.0452873 0.239 0.810950
ACGTTA -0.0364488 0.0390734 -0.933 0.350913
ACTACC 0.0161833 0.0563816 0.287 0.774090
ACTCTG 0.0481371 0.0496147 0.970 0.331943
ACTGAG 0.2001856 0.0829175 2.414 0.015770 *
ACTGCG -0.0632573 0.0476632 -1.327 0.184458
ACTTAT -0.0011373 0.0338581 -0.034 0.973205
AGAAAA 0.0301105 0.0291131 1.034 0.301020
AGAATA 0.0254565 0.0343763 0.741 0.458985
AGACAA -0.0338396 0.0408985 -0.827 0.408012
AGAGTG 0.0582759 0.0435824 1.337 0.181183
AGATAC 0.0128828 0.0547378 0.235 0.813934
AGATCG 0.0549217 0.0527391 1.041 0.297702
AGATGA 0.0304147 0.0447670 0.679 0.496887
AGATTA 0.0356649 0.0321913 1.108 0.267908
AGATTG 0.0705940 0.0341670 2.066 0.038820 *
AGCACA -0.0693810 0.0479502 -1.447 0.147920
AGCACT -0.0722368 0.0496310 -1.455 0.145543
AGCAGG 0.0818555 0.0458182 1.787 0.074020 .
AGCAGT -0.0301990 0.0483760 -0.624 0.532462
AGCGAA -0.0397309 0.0385753 -1.030 0.303035
AGCGAG -0.1067247 0.0572575 -1.864 0.062336 .
AGCGTT -0.0200157 0.0417412 -0.480 0.631572
AGCTGT 0.1051561 0.0529400 1.986 0.047003 *
AGGAAA 0.0302541 0.0363904 0.831 0.405766
AGGAGT -0.0304315 0.0517310 -0.588 0.556358
AGGCAA 0.0029479 0.0454739 0.065 0.948313
AGGCAG 0.0837007 0.0566365 1.478 0.139453
AGGCCA 0.0649186 0.0626718 1.036 0.300278
AGGCCT -0.0860851 0.0693560 -1.241 0.214536
AGGCGA -0.0898085 0.0516753 -1.738 0.082229 .
AGGGGT 0.0128960 0.0549732 0.235 0.814530
AGGGTC 0.0430534 0.0959635 0.449 0.653691
AGGTAT 0.0248674 0.0424756 0.585 0.558248
AGGTGC 0.0884942 0.0438915 2.016 0.043784 *
AGTATA 0.0989939 0.0416293 2.378 0.017411 *
AGTCAC 0.0233561 0.0558442 0.418 0.675776
AGTCAG -0.0022718 0.0437565 -0.052 0.958593
AGTCGG -0.1518141 0.0664705 -2.284 0.022380 *
AGTGAC -0.0427663 0.0531882 -0.804 0.421369
AGTGCC -0.0094010 0.0534021 -0.176 0.860262
AGTGGG 0.1452407 0.0790235 1.838 0.066077 .
AGTTAG -0.1136120 0.0503138 -2.258 0.023946 *
AGTTCA -0.0007266 0.0395515 -0.018 0.985343
AGTTGG -0.0507000 0.0537018 -0.944 0.345123
ATAAAC -0.0376604 0.0297837 -1.264 0.206070
ATACGC -0.0484596 0.0470040 -1.031 0.302562
ATACTA -0.0078307 0.0494801 -0.158 0.874252
ATACTG -0.0040397 0.0357565 -0.113 0.910048
ATAGAG 0.0248935 0.0527540 0.472 0.637015
ATAGAT -0.0142304 0.0394523 -0.361 0.718327
ATAGGA 0.0039475 0.0734717 0.054 0.957152
ATAGTG 0.0364849 0.0371702 0.982 0.326320
ATATGA -0.0730751 0.0396950 -1.841 0.065640 .
ATCAAA -0.0107741 0.0292909 -0.368 0.713000
ATCACA 0.0429447 0.0408734 1.051 0.293413
ATCAGG -0.0625007 0.0347430 -1.799 0.072034 .
ATCATA -0.0027445 0.0367608 -0.075 0.940488
ATCCCA 0.0646488 0.0486339 1.329 0.183757
ATCCCC -0.1135146 0.0485927 -2.336 0.019493 *
ATCCGG -0.0249805 0.0487685 -0.512 0.608495
ATCGTA 0.0195135 0.0473673 0.412 0.680369
ATCTAT 0.0124991 0.0342187 0.365 0.714910
ATCTCT -0.0188312 0.0423727 -0.444 0.656743
ATCTGT -0.0546275 0.0413095 -1.322 0.186043
ATGATG 0.0009513 0.0330499 0.029 0.977038
ATGCGC -0.0289324 0.0331970 -0.872 0.383466
ATGGAA 0.0121842 0.0356755 0.342 0.732708
ATGGGC 0.1088694 0.0559995 1.944 0.051887 .
ATGGGT -0.0461460 0.0502071 -0.919 0.358041
ATGGTG 0.1335614 0.0477890 2.795 0.005195 **
ATGGTT 0.0315913 0.0358271 0.882 0.377905
ATGTGG 0.1205877 0.0562330 2.144 0.032004 *
ATGTTT -0.0011160 0.0285914 -0.039 0.968865
ATTAGA -0.1192412 0.0494572 -2.411 0.015913 *
ATTCCT -0.0149659 0.0376554 -0.397 0.691043
ATTCGT -0.0817359 0.0410821 -1.990 0.046644 *
ATTCTT -0.0322920 0.0340860 -0.947 0.343456
ATTGCC 0.0318242 0.0351346 0.906 0.365057
ATTGTG 0.1083446 0.0369952 2.929 0.003406 **
ATTGTT -0.0263346 0.0276522 -0.952 0.340923
ATTTCA 0.0369943 0.0283786 1.304 0.192377
ATTTGA 0.0288458 0.0319528 0.903 0.366656
ATTTGC 0.0395107 0.0299031 1.321 0.186410
ATTTGT 0.0275211 0.0299566 0.919 0.358258
ATTTTT 0.0055589 0.0191895 0.290 0.772059
CAAAAA 0.0428849 0.0332410 1.290 0.197015
CAAAAT 0.0273184 0.0309413 0.883 0.377289
CAAACA -0.0517726 0.0382867 -1.352 0.176306
CAAACC -0.0111652 0.0453391 -0.246 0.805483
CAAACT 0.1173320 0.0401261 2.924 0.003456 **
CAAAGC -0.0149532 0.0405287 -0.369 0.712164
CAAATA 0.0594172 0.0303805 1.956 0.050498 .
CAAATT -0.0014698 0.0305009 -0.048 0.961567
CAACAG 0.0894659 0.0408565 2.190 0.028547 *
CAACCA -0.0476060 0.0500942 -0.950 0.341950
CAACGA -0.0822138 0.0536444 -1.533 0.125388
CAAGAA 0.0356044 0.0433301 0.822 0.411250
CAAGAG 0.0390965 0.0615918 0.635 0.525583
CAAGGC 0.0307217 0.0598184 0.514 0.607546
CAAGGT -0.0376725 0.0538850 -0.699 0.484475
CACAAA -0.0496227 0.0389773 -1.273 0.202982
CACAAT 0.0173282 0.0392292 0.442 0.658697
CACACA 0.0533106 0.0486460 1.096 0.273133
CACACT 0.0082194 0.0540850 0.152 0.879210
CACAGA -0.0259867 0.0485818 -0.535 0.592718
CACATG -0.1503289 0.0501335 -2.999 0.002714 **
CACATT -0.0236562 0.0371988 -0.636 0.524818
CACCTC 0.0176576 0.0644218 0.274 0.784013
CACGCT -0.1483944 0.0597872 -2.482 0.013067 *
CACGGC -0.0045647 0.0596433 -0.077 0.938995
CACGTT -0.1152615 0.0468792 -2.459 0.013948 *
CACTAT -0.0062379 0.0431642 -0.145 0.885095
CACTCT -0.0471972 0.0492835 -0.958 0.338236
CACTGC -0.0943948 0.0483261 -1.953 0.050792 .
CACTGG -0.1428106 0.0501310 -2.849 0.004391 **
CAGAAA -0.0431665 0.0310257 -1.391 0.164137
CAGAGA -0.0247082 0.0525077 -0.471 0.637955
CAGAGT -0.0841138 0.0522699 -1.609 0.107575
CAGCAA -0.0026232 0.0323125 -0.081 0.935297
CAGCAG -0.0243111 0.0497447 -0.489 0.625044
CAGCTC 0.0085107 0.0617543 0.138 0.890386
CAGGAC -0.0915517 0.0510952 -1.792 0.073173 .
CAGGAT 0.0252656 0.0356034 0.710 0.477931
CAGGCA -0.0032914 0.0407616 -0.081 0.935644
CAGTCG -0.0016126 0.0630713 -0.026 0.979602
[ reached getOption("max.print") -- omitted 452 rows ]
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.439 on 46929 degrees of freedom
(5917 observations deleted due to missingness)
Multiple R-squared: 0.034, Adjusted R-squared: 0.0206
F-statistic: 2.537 on 651 and 46929 DF, p-value: < 0.00000000000000022
Maybe we can just focus on CRP and try to make a position-specific model. The change in CRP score (let’s say we use an HMM) at each position can be weighted by the average change in expression at the position across the entire library. Or learn the appropriate weights and see how they compare to the average expression change. To do so, compare scrambled to wild-type and note all the k-mers that are different in the scramble and the relative position in the sequence. Link this to change in expression. This input set (position, change in CRP score) and output set (change in expression) can be used for models. Regression?
crp_phmm <- derivePHMM(crp_sites, k = 5)
Deriving profile HMM
Refining model
Iteration 1: alignment with 376 rows & 24 columns, PHMM with 22 modules
Iteration 2: alignment with 376 rows & 24 columns, PHMM with 22 modules
Iteration 3: alignment with 376 rows & 24 columns, PHMM with 22 modules
Error in tmp == hashis : non-conformable arrays
We can view the k-mer change between wild-type and scramble as either a k-mer destroyed in wild-type or a k-mer created in scramble. Let’s go with k-mers that are destroyed in the wild-type.
df <- filter(data, tss_name == 'TSS_10021_storz_wanner_regulondb')
# set wild-type
wt <- filter(df, category == 'unscramble')
wt_seq <- as.character(wt$variant)
vars <- filter(df, category == 'scramble') %>% arrange(scramble_start)
total_phmm_score <- function(seq_list, phmm) {
forward_path <- lapply(seq_list, forward, x = phmm)
log_odds <- unlist(lapply(forward_path, `[[`, 1))
return(sum(log_odds))
}
highest_phmm_match <- function(seq_list, phmm) {
forward_path <- lapply(seq_list, forward, x = phmm)
log_odds <- unlist(lapply(forward_path, `[[`, 1))
index <- which.max(log_odds)
max_log_odds <- log_odds[index]
# best_match <- paste(seq_list[[index]], collapse = '')
return(c(max_log_odds, index))
}
scan_phmms <- function(sequence, phmms, score_type) {
# generate tiles
n <- phmms[[1]]$size
starts <- seq(nchar(sequence) - n + 1)
tiles <- mapply(substr, start = starts, stop = starts + n - 1,
MoreArgs = list(x = sequence))
tiles_list <- strsplit(tiles, split = '')
if(score_type == 'sum') {
score <- lapply(phmms, total_phmm_score, seq_list = tiles_list)
}
else if(score_type == 'max') {
score <- lapply(phmms, highest_phmm_match, seq_list = tiles_list)
}
return(score)
}
phmms <- list(crp_phmm)
vars <- vars %>%
rowwise() %>%
mutate(total_crp_phmm_score = unlist(scan_phmms(variant,
phmms = phmms,
score_type = 'sum')))
wt_score <- unlist(scan_phmms(wt_seq, phmms, score_type = 'sum'))
vars$delta_phmm_score <- vars$total_crp_phmm_score - wt_score
Let’s look at this test example and see how the change in CRP score compares with relative expression (scrambled expression / wild-type expression)
cor.test(vars$relative_exp, vars$delta_phmm_score)
Pearson's product-moment correlation
data: vars$relative_exp and vars$delta_phmm_score
t = 1.1325, df = 26, p-value = 0.2678
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.1700137 0.5457475
sample estimates:
cor
0.2168154
Hmm there does seem to be a connection. If the scramble increases the score, expression goes up. The correlation is mild, but it’s not significant here.
vars <- vars %>%
mutate(delta_expression = RNA_exp_ave - unscrambled_exp)
ggplot(vars, aes(delta_phmm_score, delta_expression)) + geom_point()
cor.test(log(vars$delta_expression), vars$delta_phmm_score)
NaNs produced
Pearson's product-moment correlation
data: log(vars$delta_expression) and vars$delta_phmm_score
t = 1.8939, df = 17, p-value = 0.07538
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.04540256 0.73270737
sample estimates:
cor
0.4174148
Correlation is better with log.
Let’s wrap this all into one function that can be executed on data grouped by TSS name
phmms <- list(crp_phmm)
data <- data %>%
rowwise() %>%
mutate(total_crp_phmm_score = unlist(scan_phmms(variant,
phmms = phmms,
score_type = 'sum'))) %>%
ungroup()
# calculate difference between scramble and wild-type
data <- data %>%
group_by(tss_name) %>%
mutate(wt_total_crp_phmm_score = ifelse(any(category == 'unscramble'),
total_crp_phmm_score[category == 'unscramble'],
NA),
delta_phmm_score = total_crp_phmm_score - wt_total_crp_phmm_score) %>%
ungroup()
data <- data %>%
mutate(delta_expression = RNA_exp_ave - unscrambled_exp)
ggplot(data, aes(delta_phmm_score, delta_expression)) + geom_point(alpha = 0.25)
cor.test(data$delta_expression, data$delta_phmm_score)
Pearson's product-moment correlation
data: data$delta_expression and data$delta_phmm_score
t = 1.4881, df = 47579, p-value = 0.1367
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.002163488 0.015806306
sample estimates:
cor
0.00682196
What if we looked at the difference in the log transformed expression?
tmp <- data %>%
mutate(log_exp = log(RNA_exp_ave),
wt_log_exp = log(unscrambled_exp),
delta_log_exp = log_exp - wt_log_exp)
cor.test(tmp$delta_log_exp, tmp$delta_phmm_score)
Pearson's product-moment correlation
data: tmp$delta_log_exp and tmp$delta_phmm_score
t = 4.4709, df = 47579, p-value = 0.000007805
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.01150958 0.02947267
sample estimates:
cor
0.02049278
Hmm, interesting the correlation does improve and is significant now, even though it’s really small.
ggplot(data, aes(as.character(scramble_start), delta_phmm_score)) + geom_boxplot() +
labs(x = 'scramble start position', y = 'change in CRP score') +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
data %>%
filter(category == 'scramble') %>%
select(scramble_start, delta_phmm_score, delta_expression) %>%
lm(delta_expression ~ . , data = .) %>%
summary()
Call:
lm(formula = delta_expression ~ ., data = .)
Residuals:
Min 1Q Median 3Q Max
-49.010 0.025 0.184 0.323 29.380
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.1620857 0.0210336 -7.706 0.0000000000000132 ***
scramble_start -0.0010427 0.0002577 -4.047 0.0000520487610120 ***
delta_phmm_score 0.0022791 0.0016293 1.399 0.162
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.304 on 45782 degrees of freedom
(5917 observations deleted due to missingness)
Multiple R-squared: 0.000401, Adjusted R-squared: 0.0003573
F-statistic: 9.182 on 2 and 45782 DF, p-value: 0.0001031
data %>%
filter(category == 'scramble') %>%
mutate(log_exp = log(RNA_exp_ave),
wt_log_exp = log(unscrambled_exp),
delta_log_exp = log_exp - wt_log_exp) %>%
select(scramble_start, delta_phmm_score, delta_log_exp) %>%
lm(delta_log_exp ~ . , data = .) %>%
summary()
Call:
lm(formula = delta_log_exp ~ ., data = .)
Residuals:
Min 1Q Median 3Q Max
-7.6194 -0.2474 0.1032 0.3788 4.2649
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.08997054 0.00718880 -12.515 < 0.0000000000000002 ***
scramble_start -0.00179039 0.00008807 -20.329 < 0.0000000000000002 ***
delta_phmm_score 0.00234632 0.00055685 4.214 0.0000252 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7875 on 45782 degrees of freedom
(5917 observations deleted due to missingness)
Multiple R-squared: 0.009338, Adjusted R-squared: 0.009294
F-statistic: 215.8 on 2 and 45782 DF, p-value: < 0.00000000000000022
Model is marginally better using log transformed data, which is interesting.
Let’s find the location of the best CRP match for each wild-type sequence.
wt_sequences <- as.list(filter(data, category == 'unscramble') %>% .$variant)
wt_phmm_hits <- lapply(wt_sequences, scan_phmms, phmms = phmms, score_type = 'max')
# unlist the sublists
wt_phmm_hits <- lapply(wt_phmm_hits, unlist)
Now that we know where the best CRP site is for each WT, let’s create a new data frame that lists, for each WT sequence, the scrambled variants that overlap the site and the change in expression between WT and scrambled.
# source("http://bioconductor.org/biocLite.R")
# biocLite("IRanges")
library(IRanges)
Loading required package: BiocGenerics
Loading required package: parallel
Attaching package: ‘BiocGenerics’
The following objects are masked from ‘package:parallel’:
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, clusterExport, clusterMap,
parApply, parCapply, parLapply, parLapplyLB, parRapply, parSapply, parSapplyLB
The following objects are masked from ‘package:dplyr’:
combine, intersect, setdiff, union
The following objects are masked from ‘package:stats’:
IQR, mad, sd, var, xtabs
The following objects are masked from ‘package:base’:
anyDuplicated, append, as.data.frame, basename, cbind, colMeans, colnames, colSums,
dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect,
is.unsorted, lapply, lengths, Map, mapply, match, mget, order, paste, pmax, pmax.int,
pmin, pmin.int, Position, rank, rbind, Reduce, rowMeans, rownames, rowSums, sapply,
setdiff, sort, table, tapply, union, unique, unsplit, which, which.max, which.min
Loading required package: S4Vectors
Loading required package: stats4
Attaching package: ‘S4Vectors’
The following object is masked from ‘package:tidyr’:
expand
The following objects are masked from ‘package:dplyr’:
first, rename
The following object is masked from ‘package:base’:
expand.grid
Attaching package: ‘IRanges’
The following objects are masked from ‘package:dplyr’:
collapse, desc, slice
# assign position relative to TSS and with respect to strand
wt_df <- wt_df %>%
mutate(wt_best_crp_score_pos_relative = case_when(.$strand == '+' ~ wt_best_crp_score_pos - 121,
.$strand == '-' ~ 150 - wt_best_crp_score_pos - 120,
TRUE ~ NaN))
ggplot(wt_df, aes(factor(wt_best_crp_score_pos_relative), delta_log_exp)) + geom_boxplot() +
theme(axis.text.x = element_text(angle = 90, size = 8)) +
labs(x = 'start position of best WT CRP site',
y = 'log expression scramble - \n log expression WT')
cor.test(wt_df$wt_best_crp_score_pos_relative, wt_df$delta_log_exp)
Pearson's product-moment correlation
data: wt_df$wt_best_crp_score_pos_relative and wt_df$delta_log_exp
t = 0.68491, df = 8412, p-value = 0.4934
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.01390249 0.02883056
sample estimates:
cor
0.007467443
Not all of the WT sequences are actually regulated by CRP and could be contributing noise. Let’s assign a significance to each CRP score and filter out sequences that do not have strong sites. Let’s take a random sample of the genome and plot the distribution of CRP HMM scores.
bedtools random -l 22 -n 10000 -seed 123 -g ../../ref/Escherichia_coli_K-12_MG1655_genomeFile.txt > ecoli_genome_random_22bp.bed
bedtools getfasta -fi ../../ref/Escherichia_coli_K-12_MG1655.fasta -bed ecoli_genome_random_22bp.bed -fo ecoli_genome_random_22bp.txt -tab -s
genome_random <- read.table('ecoli_genome_random_22bp.txt', header = F,
col.names = c('name', 'sequence'))
random_sequences <- strsplit(genome_random$sequence, '')
random_phmm_hits <- lapply(random_sequences, forward, x = crp_phmm)
# extract just score
random_phmm_hits <- unlist(lapply(random_phmm_hits, `[[`, 1))
ggplot(data.frame(score = random_phmm_hits), aes(score)) + geom_density() +
labs(x = 'CRP HMM score', title = '100,000 random genomic sequences 22bp')
Calculate permutation p-value: B / m B : how many observations from the random genomic distribution have a value at least as extreme m: number of tests
This can also be thought of as 1 - percentile. Example, 20% percentile is value below which 20% of observations may be found.
random_hits_ecdf <- ecdf(random_phmm_hits)
wt_df <- wt_df %>%
mutate(emp_pval = 1 - random_hits_ecdf(wt_best_crp_score))
ggplot(wt_df, aes(wt_best_crp_score)) + geom_density()
ggplot(wt_df, aes(emp_pval)) + geom_density() +
geom_vline(xintercept = 0.05, color = 'red')
It seems like the distribution of CRP scores for our library is much different than the genomic distribution. Maybe promoters in general are more likely to have at least one high scoring CRP site compared to random genomic sequence (which would fall in genes too). So, most of these WT sequences would have a significant CRP site with this current “null” genomic distribution. Or, maybe scanning the 150bp chunks with the 22bp HMM is just much more sampling, so more likely to get a higher score. Let’s grab 150bp random chunks from the genome and see if it changes anything.
bedtools random -l 150 -n 10000 -seed 123 -g ../../ref/Escherichia_coli_K-12_MG1655_genomeFile.txt > ecoli_genome_random_150bp.bed
bedtools getfasta -fi ../../ref/Escherichia_coli_K-12_MG1655.fasta -bed ecoli_genome_random_150bp.bed -fo ecoli_genome_random_150bp.txt -tab -s
genome_random150 <- read.table('ecoli_genome_random_150bp.txt', header = F,
col.names = c('name', 'sequence'))
random150_sequences <- as.list(genome_random150$sequence)
random150_phmm_hits <- lapply(random150_sequences, scan_phmms, phmms = phmms, score_type = 'max')
# unlist
random150_phmm_hits <- lapply(random150_phmm_hits, `[[`, 1)
random150_phmm_hits_score <- unlist(lapply(random150_phmm_hits, `[[`, 1))
ggplot(data.frame(score = random150_phmm_hits_score), aes(score)) + geom_density() +
labs(x = 'CRP HMM score', title = '100,000 random genomic sequences 150bp') +
geom_density(data = wt_df, aes(wt_best_crp_score), color = 'red')
random_hits_ecdf <- ecdf(random150_phmm_hits_score)
wt_df <- wt_df %>%
mutate(emp_pval = 1 - random_hits_ecdf(wt_best_crp_score))
ggplot(wt_df, aes(emp_pval)) + geom_density() +
geom_vline(xintercept = 0.05, color = 'red')
Much better!
wt_df_crp <- filter(wt_df, emp_pval <= .05)
table(wt_df_crp$category)
scramble unscramble
992 270
table(data$category)
scramble unscramble
51702 1796
About 15% of WT (unscrambled) sequences contain a CRP site with an empirical p-value of 0.05.
Let’s look at the start position of the WT best CRP score vs. change in logged expression for only the WTs (and their overlapping scrambled variants) with significant CRP sites.
ggplot(wt_df_crp, aes(factor(wt_best_crp_score_pos_relative), delta_log_exp)) + geom_boxplot() +
theme(axis.text.x = element_text(angle = 90, size = 10)) +
geom_hline(yintercept = 0, linetype = 'dashed') +
labs(x = 'start position of best WT CRP site',
y = 'log expression scramble - \n log expression WT',
title = 'WT with significant CRP site')
wt_df_crp %>%
mutate(score_fact = factor(wt_best_crp_score_pos_relative)) %>%
group_by(score_fact) %>%
summarise(mean_delta = mean(delta_log_exp),
n = n()) %>%
ggplot(aes(score_fact, mean_delta)) + geom_bar(stat = 'identity') +
theme(axis.text.x = element_text(angle = 90, size = 10)) +
labs(x = 'start position of best WT CRP site',
y = 'mean(log expression scramble - \n log expression WT)')
wt_df_crp %>%
mutate(score_fact = factor(wt_best_crp_score_pos_relative)) %>%
group_by(score_fact) %>%
summarise(mean_delta = mean(delta_log_exp)) %>%
mutate(pos = as.numeric(levels(score_fact))) %>%
ggplot(aes(pos, mean_delta)) + geom_point() +
# geom_smooth(span = 0.1) +
geom_line() +
geom_hline(yintercept = 0, linetype = 'dashed')
Let’s get the mean change in log expression at each position, before we filter out for CRP significance. Then, we can normalize to this number to account for any positions that are sensitive to scrambling in general.
wt_df_mean_pos <- wt_df %>%
mutate(score_fact = factor(wt_best_crp_score_pos_relative)) %>%
group_by(score_fact) %>%
summarise(mean_delta_all = mean(delta_log_exp))
wt_df_crp_mean_pos <- wt_df_crp %>%
mutate(score_fact = factor(wt_best_crp_score_pos_relative)) %>%
group_by(score_fact) %>%
summarise(mean_delta = mean(delta_log_exp))
wt_df_mean_pos_all <- full_join(wt_df_mean_pos, wt_df_crp_mean_pos, by = 'score_fact') %>%
mutate(mean_delta_norm = mean_delta - mean_delta_all,
score_fact = factor(score_fact))
Column `score_fact` joining factors with different levels, coercing to character vector
ggplot(wt_df_mean_pos_all, aes(reorder(score_fact, sort(as.numeric(score_fact))), mean_delta_norm)) + geom_bar(stat = 'identity') +
theme(axis.text.x = element_text(angle = 90, size = 10)) +
labs(x = 'start position of best WT CRP site',
y = 'normalized mean(log expression scramble - \n log expression WT)')
Looks almost the same, even after subtracting the average expression change for all WT sequences (including those that didn’t pass CRP filter).
Let’s put all of this functionality in one place, so we can generate these graphs for a TF of our choice.
phmm <- derivePHMM(sites_filtered, k = 5)
Deriving profile HMM
Refining model
Iteration 1: alignment with 104 rows & 15 columns, PHMM with 15 modules
Sequential alignments were identical after 1 iterations
Done
tf <- 'Fis'
wt_df_fis <- score_tf_phmm(tf, tf_sites, data, random150_sequences)
Deriving profile HMM
Refining model
Iteration 1: alignment with 243 rows & 15 columns, PHMM with 15 modules
Sequential alignments were identical after 1 iterations
Done
[1] "All TF hits in WT: 53502"
[1] "All significant TF hits in WT: 1204"
ggplot(wt_df_fis, aes(factor(wt_best_tf_score_pos_relative), delta_log_exp)) + geom_boxplot() +
theme(axis.text.x = element_text(angle = 90, size = 8)) +
labs(x = paste('start position of best WT', tf, 'site'),
y = 'log expression scramble - \n log expression WT')
wt_df_fis %>%
mutate(score_fact = factor(wt_best_tf_score_pos_relative)) %>%
group_by(score_fact) %>%
summarise(mean_delta = mean(delta_log_exp)) %>%
mutate(pos = as.numeric(levels(score_fact))) %>%
ggplot(aes(pos, mean_delta)) + geom_point() +
# geom_smooth(span = 0.1) +
geom_line() +
geom_hline(yintercept = 0, linetype = 'dashed')
crp_mean_line <- wt_df_crp %>%
mutate(score_fact = factor(wt_best_crp_score_pos_relative)) %>%
group_by(score_fact) %>%
summarise(mean_delta = mean(delta_log_exp)) %>%
mutate(pos = as.numeric(levels(score_fact)))
fis_mean_line <- wt_df_fis %>%
mutate(score_fact = factor(wt_best_tf_score_pos_relative)) %>%
group_by(score_fact) %>%
summarise(mean_delta = mean(delta_log_exp)) %>%
mutate(pos = as.numeric(levels(score_fact)))
ggplot(aes(pos, mean_delta), data=crp_mean_line) + geom_point() +
# geom_smooth(span = 0.1) +
geom_line() +
geom_hline(yintercept = 0, linetype = 'dashed') +
geom_line(aes(pos, mean_delta), data=fis_mean_line, color='blue') +
scale_x_continuous(breaks=seq(-120, 30, 10)) +
geom_vline(xintercept = seq(-120, 30, 10), linetype='dashed') +
labs(x = 'position relative to TSS',
y = 'log expression scramble - \n log expression WT')
ggplot(aes(pos, mean_delta), data=crp_mean_line) +
geom_smooth(span = 0.1, color='black') +
geom_hline(yintercept = 0, linetype = 'dashed') +
scale_x_continuous(breaks=seq(-120, 30, 10)) +
geom_vline(xintercept = seq(-120, 30, 10), linetype='dashed') +
labs(x = 'position relative to TSS',
y = 'mean log expression scramble - \n log expression WT',
title = 'WT with significant CRP site')
ggplot(aes(pos, mean_delta), data=crp_mean_line) +
geom_smooth(span = 0.1, color='black') +
geom_hline(yintercept = 0, linetype = 'dashed') +
geom_smooth(aes(pos, mean_delta), data=fis_mean_line, color='blue', span=0.1) +
scale_x_continuous(breaks=seq(-120, 30, 10)) +
geom_vline(xintercept = seq(-120, 30, 10), linetype='dashed') +
labs(x = 'position relative to TSS',
y = 'mean log expression scramble - \n log expression WT')
tf <- 'ArcA'
wt_df_arca <- score_tf_phmm(tf, tf_sites, data, random150_sequences)
Deriving profile HMM
Refining model
Iteration 1: alignment with 104 rows & 15 columns, PHMM with 15 modules
Sequential alignments were identical after 1 iterations
Done
[1] "All TF hits in WT: 53502"
[1] "All significant TF hits in WT: 0"
ggplot(wt_df_arca, aes(factor(wt_best_tf_score_pos_relative), delta_log_exp)) + geom_boxplot() +
theme(axis.text.x = element_text(angle = 90, size = 8)) +
labs(x = paste('start position of best WT', tf, 'site'),
y = 'log expression scramble - \n log expression WT')
arca_mean_line <- wt_df_arca %>%
mutate(score_fact = factor(wt_best_tf_score_pos_relative)) %>%
group_by(score_fact) %>%
summarise(mean_delta = mean(delta_log_exp)) %>%
mutate(pos = as.numeric(levels(score_fact)))
ggplot(aes(pos, mean_delta), data=fis_mean_line) +
geom_smooth(span = 0.1, color='blue') +
geom_hline(yintercept = 0, linetype = 'dashed') +
geom_smooth(aes(pos, mean_delta), data=arca_mean_line, color='red', span=0.1) +
scale_x_continuous(breaks=seq(-120, 30, 10)) +
geom_vline(xintercept = seq(-120, 30, 10), linetype='dashed') +
labs(x = 'position relative to TSS',
y = 'log expression scramble - \n log expression WT')